Hypothesis Tests on Audio Features

Two sample - independent tests

This is an example of the hypothesis tests which were carried out on each audio feature.

H0: The mean danceability in 1960s is the same as the mean danceability in 2010s

Ha: The mean danceability in 1960s is less than the mean danceability in 2010s

H0: The difference in means in 0 Ha: danceability2020 - danceability1960 > 0

How do the audio features change over time

spotify_data_clean_pivot <- spotify_data_clean_join %>% 
  pivot_longer(cols = acousticness:valence,
              names_to = "audio_feature",
              values_to = "value") %>% 
  group_by(year, audio_feature) %>% 
  mutate(avg_feature = mean(value))

Plot with all Audio Features

Plot with audio features showing change

spotify_data_clean_join %>% 
  #mutate(explicit = as.numeric(explicit)) %>% 
  group_by(year) %>% 
  summarise(total_ex = sum(explicit)) %>% 
  ggplot() +
  aes(x = year, y = total_ex / 20) +
  geom_line(colour = "red", linewidth = 2) +
  ylim(0, 50) +
  labs(x = "Year",
       y = "Proportion")+
       # title = "Proportion of Explicit Songs",
       # subtitle = "by year") +
  theme_classic() +
  theme(axis.text  = element_text(face = "bold", size = 18, family = "Times"),
        axis.title  = element_text(face = "bold", size = 22, family = "Times"),
        title = element_text(face = "bold", size = 18, family = "Times"),
        legend.text = element_text(face = "bold", size = 14, family = "Times"),
        legend.title = element_blank()
  ) 

ggsave("plot_images/explicit.png", dpi = 720, width = 10, height = 6)

spotify_data_clean_join %>% 
  group_by(year) %>% 
  summarise(mean_pop = mean(popularity)) %>% 
  ggplot() +
  aes(x = year, y = mean_pop) +
  geom_line(colour = "purple2", linewidth = 2) +
  labs(x = "Year",
       y = "Popularity") +
  ylim(0, 100) +
  theme_bw() +
  scale_colour_brewer(palette =  "Dark2") +
  theme(axis.text  = element_text(face = "bold", size = 18, family = "Times"),
        axis.title  = element_text(face = "bold", size = 22, family = "Times"),
        title = element_text(face = "bold", size = 18, family = "Times"),
        legend.text = element_text(face = "bold", size = 14, family = "Times")
  ) 

ggsave("plot_images/avg_pop.png", dpi = 720, width = 10, height = 6)

spot_sample %>% 
  ggplot() +
  aes(x = danceability, y = popularity) +
  geom_point(colour = "#1DB954", alpha = 0.8) +
  geom_smooth(method = "lm", se = FALSE, colour = "purple3", linewidth = 2) +
  labs(x = "Danceability",
       y = "Popularity") +
       theme_classic() +
  theme(axis.text  = element_text(face = "bold", size = 18, family = "Times"),
        axis.title  = element_text(face = "bold", size = 20, family = "Times"),
        title = element_text(face = "bold", size = 18, family = "Times"),
        legend.text = element_text(face = "bold", size = 14, family = "Times")
  )


#ggsave("plot_images/pop_vs_dance.png", dpi = 720, width = 10, height = 6)
spot_sample %>% 
  ggplot() +
  aes(x = loudness, y = popularity) +
  geom_point(colour = "#1DB954", alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, colour = "purple3", linewidth = 2) +
  labs(x = "Loudness",
       y = "Popularity") +
       theme_classic() +
  theme(axis.text  = element_text(face = "bold", size = 18, family = "Times"),
        axis.title  = element_text(face = "bold", size = 22, family = "Times"),
        title = element_text(face = "bold", size = 18, family = "Times"),
        legend.text = element_text(face = "bold", size = 14, family = "Times")
  )

ggsave("plot_images/pop_vs_loud.png", dpi = 720, width = 10, height = 6)

Intro To Linear Regression

To help me answer my question of what makes a song “popular” on spotify, I decided to build an explanatory linear regression model.

This type of analysis is used to determine the strength of the relationship between a response variable and multiple explanatory variables.

So in this case I will be choosing from all of the variables I have just discussed to explain the popularity of a song on spotify.

y = b0 + b1x1 + b2x2 + b3x3….bnxn

popularity = b0 + b1x1 + b2x2 + b3x3….bnxn

To start this process I plot popularity against each one of my variables or possible explanatory variables and find the strongest correlation.

n_data <- nrow(spotify_data_for_modelling)

sample_index <- sample(1:n_data, size = n_data*0.1)

spot_sample <- slice(spotify_data_for_modelling, sample_index)

spot_sample %>% 
  ggplot() +
  aes(x = year, y = popularity) +
  geom_point(alpha = 0.7, colour = "#1DB954") +
  geom_smooth(method = "lm", se = FALSE, colour = "purple2") +
  theme_classic() +
  labs(x = "Year",
       y = "Popularity") +
  theme(axis.text  = element_text(face = "bold", size = 18, family = "Times"),
        axis.title  = element_text(face = "bold", size = 18, family = "Times"),
        title = element_text(face = "bold", size = 18, family = "Times"),
        legend.text = element_text(face = "bold", size = 14, family = "Times")
  )


#ggsave("plot_images/pop_vs_dance.png", dpi = 720, width = 10, height = 6)
spot_sample %>% 
  ggplot() +
  aes(x = explicit, y = popularity) +
  geom_point(alpha = 0.7, colour = "red4") +
  geom_smooth(method = "lm", se = FALSE)

My strongest correlation was year (the year the track was released) with a correlation of 0.74. I then add this to my model as my first explanatory variable.

model_1a <- lm(popularity ~ year,
               data = train_lm)

summary(model_1a)

Call:
lm(formula = popularity ~ year, data = train_lm)

Residuals:
    Min      1Q  Median      3Q     Max 
-19.600  -7.447  -1.773   5.668  54.400 

Coefficients:
                Estimate   Std. Error t value            Pr(>|t|)    
(Intercept) -1220.936768     3.742482  -326.2 <0.0000000000000002 ***
year            0.634644     0.001881   337.4 <0.0000000000000002 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 10.08 on 95918 degrees of freedom
Multiple R-squared:  0.5427,    Adjusted R-squared:  0.5427 
F-statistic: 1.138e+05 on 1 and 95918 DF,  p-value: < 0.00000000000000022

After running my model I’m looking at 3 factors: - The P-value - is this variable making a significant difference. If the P-value is below the significance level of 0.05 then we can reject the null hypothesis and conclude that correlation between the variables is significant

  • The R^2 - is a measure that indicates how much of the variation of popularity is explained by the year
  • The adjusted R^2 - compensates for the addition of variables. So as we’re building an explanatory model, we don’t want this to drop much lower than the r^2.
model_3a <- lm(popularity ~ year + danceability + loudness,
               data = train_lm)

summary(model_3a)
model_3b <- lm(popularity ~ year + danceability + instrumentalness,
               data = train_lm)

summary(model_3b)
anova(model_3a, model_2c)

Final Model

So here we have our final model. I stopped adding variables as the Adjusted r^2 started dropping and our multiple r^2 was barely going up.

So we can see that the more recent the release, the more danceability, the louder it is, the less liveness it has. So you don’t want a live recording, you want a studio recording with some explicit lyrics chucked in there as well.

All of our P-Values are significant and we have a Multiple r^2 of 0.55, with an adjusted r^2 also of 0.55. This means that 55% of the variance in popularity is explained our other variables. Which also means that 45% of the variability in the data cannot be explained by this model.

55% isn’t a very high proportion but it’s not terrible. We are trying to measure the popularity of a song. It’s not quite as obvious as if we were trying to explain the value of a house. We might find that the postcode and the number of rooms goes a long way to explaining that value. I think explaining the popularity of a song is bit more complicated than that. If it was easy to know what made a song popular then we’d all be musicians.

Mention human behaviour, mention the basic practice of statistics would call this a moderate effect size.

So my model isn’t great. What can I do to improve this? Well…

When I was researching how spotify calculate their popularity score I kept coming across this one phrase.

50 is the magic number

The higher your popularity index, the more likely the algorithm is to recommend you to new listeners, and place you in algorithmic playlists like Release Radar and Discover Weekly. Many websites and blogs had theories on what number you had to hit to be added to certain editorial playlists.

DIYMusician suggests that and popularity of 20+ in the first few weeks of release will get you onto the Release Radar playlist and a popularity score of 30+ will get you onto Discover Weekly. Loudlab suggested that 50 is the magic number. I liked this idea of having a threshold of whether or not a song is popular. So going back to my variables I created a column called is_popular which is only TRUE if the popularity of the song is 50 or over. This also lends itself nicely to a logistic regression model.

Logistic regression is a statistical analysis method to predict, or explain a binary outcome, such as yes or no, based on prior observations of a data set.

So rather than using the popularity score I have used the variable I created called is_popular, which splits the data in to a logical type, so it’s TRUE if the song is 50 or above.

The way this is built is very similar to the linear model. I look for correlations, and I add them to my model 1 at a time, check they are significant. The main difference is that I’m looking for for a high AUC score this time, rather than the multiple r^squared I was looking for in the linear model.

I’ll explain the AUC in a second. I decided for this model to make a couple of changes. I decided to no longer include the year or decade the song was released. If I’m staying true to my original question then how could I possibly write a song that was released in the past. It had such a large influence on the linear model I thought it would be more interesting to see how the logistic model fared without it. And then I can test my model against different decades to see how it performs.

Oscar Wilde is famously credited with having written: “Popularity is the one insult I have never suffered.”

spot_sample %>% 
  mutate(explicit = as.logical(explicit)) %>% 
  ggplot() +
  aes(x = loudness, y = popularity, colour = explicit) +
  geom_point(alpha = 0.8) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(x = "Loudness",
       y = "Popularity",
       title = "Relationship between Popularity and Loundess",
       subtitle = "Grouped by explicit or not") +
    scale_colour_brewer(palette =  "Dark2") +
  theme_bw() +
  theme(axis.text  = element_text(face = "bold", size = 15, family = "Times"),
        axis.title  = element_text(face = "bold", size = 15, family = "Times"),
        title = element_text(face = "bold", size = 15, family = "Times"),
        legend.text = element_text(face = "bold", size = 10, family = "Times")
  ) 

spot_sample %>% 
  mutate(explicit = as.logical(explicit)) %>%
ggplot() +
  aes(x = loudness, y = as.integer(is_popular), colour = explicit) +
  geom_jitter(shape = 1,
              position = position_jitter(h = 0.05, w = 0.05),
              alpha = 0.8) +
   geom_line(data = train_model_1_loud, aes(x = loudness , y = pred), col = 'red') +
  ylab("Probability") +
      scale_colour_brewer(palette =  "Dark2") +
  theme_bw() +
  theme(axis.text  = element_text(face = "bold", size = 15, family = "Times"),
        axis.title  = element_text(face = "bold", size = 15, family = "Times"),
        title = element_text(face = "bold", size = 15, family = "Times"),
        legend.text = element_text(face = "bold", size = 10, family = "Times")
  ) 



# ggplot(mortgage_data) +
#   geom_jitter(aes(x = tu_score, y = as.integer(accepted)), shape = 1, 
#               position = position_jitter(h = 0.03)) + 
#    geom_line(data = predict_log, aes(x = tu_score , y = pred), col = 'red') + 
#   ylab("Probability")

Final Logistic Regression Model

model_4_final <- glm(is_popular ~ loudness + explicit + danceability + no_of_artists,
             family = "binomial",
             data = train_log_mod)

summary(model_4_final)

Call:
glm(formula = is_popular ~ loudness + explicit + danceability + 
    no_of_artists, family = "binomial", data = train_log_mod)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.3012  -0.8550  -0.6624   1.1355   3.7138  

Coefficients:
                Estimate Std. Error z value            Pr(>|z|)    
(Intercept)   -7.7547047  0.0993408  -78.06 <0.0000000000000002 ***
loudness       0.0780446  0.0012231   63.81 <0.0000000000000002 ***
explicit       1.0409331  0.0237662   43.80 <0.0000000000000002 ***
danceability   0.0095757  0.0004548   21.06 <0.0000000000000002 ***
no_of_artists  0.1844060  0.0114087   16.16 <0.0000000000000002 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 121164  on 97324  degrees of freedom
Residual deviance: 110272  on 97320  degrees of freedom
AIC: 110282

Number of Fisher Scoring iterations: 4
tidy(model_4_final)

train_model_4_final <- train_log_mod %>%
  add_predictions(model_4_final, type = "response")

roc_obj_mod4 <- train_model_4_final %>%
  roc(response = is_popular, predictor = pred)
Setting levels: control = FALSE, case = TRUE
Setting direction: controls < cases
roc_curve <- ggroc(
  data = list(
    best_model = roc_obj_mod4
  ), 
  legacy.axes = TRUE) +
  coord_fixed() + 
  theme_classic()

auc(roc_obj_mod4)
Area under the curve: 0.7223
log_mod_1990 <- train_log_mod %>% 
  filter(decade == 1990 |
           decade == 2000 |
           decade == 2010)

model_90_00_10 <- glm(is_popular ~ loudness + explicit + danceability + no_of_artists,
                      family = "binomial",
                      data = log_mod_1990)

summary(model_90_00_10)

Call:
glm(formula = is_popular ~ loudness + explicit + danceability + 
    no_of_artists, family = "binomial", data = log_mod_1990)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.2496  -1.1659   0.8386   1.1431   2.3491  

Coefficients:
                Estimate Std. Error z value             Pr(>|z|)    
(Intercept)   -3.1595029  0.1137598 -27.773 < 0.0000000000000002 ***
loudness       0.0342887  0.0014178  24.185 < 0.0000000000000002 ***
explicit       0.4662174  0.0258787  18.016 < 0.0000000000000002 ***
danceability   0.0031530  0.0005672   5.559         0.0000000271 ***
no_of_artists  0.1691662  0.0143345  11.801 < 0.0000000000000002 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 66344  on 47879  degrees of freedom
Residual deviance: 64758  on 47875  degrees of freedom
AIC: 64768

Number of Fisher Scoring iterations: 4
tidy(model_90_00_10)

train_model_90_00_10 <- log_mod_1990 %>%
  add_predictions(model_90_00_10, type = "response")

roc_obj_mod_90_00_10 <- train_model_90_00_10 %>%
  roc(response = is_popular, predictor = pred)
Setting levels: control = FALSE, case = TRUE
Setting direction: controls < cases
roc_curve <- ggroc(
  data = list(
    best_model = roc_obj_mod4,
    dec_60s_70s_80s = roc_obj_mod_60_70_80,
    dec_90s_00s_10s = roc_obj_mod_90_00_10
  ), 
  legacy.axes = TRUE,
  linewidth = 2) +
  coord_fixed() +
  labs(x = "1 - Specificity",
       y = "Sensitivity") +
  scale_colour_brewer(palette =  "Dark2") +
  scale_colour_discrete(labels = c("Final Model", "60s, 70s & 80s", "90s, 00s & 10s")) +
  theme_classic() +
  theme(axis.text  = element_text(face = "bold", size = 18, family = "Times"),
        axis.title  = element_text(face = "bold", size = 18, family = "Times"),
        title = element_text(face = "bold", size = 18, family = "Times"),
        legend.text = element_text(face = "bold", size = 18, family = "Times"),
        #legend.text = element_blank(),
        legend.title = element_blank()
        #legend.key = element_blank()
  )
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
auc(roc_obj_mod_90_00_10)
Area under the curve: 0.629
roc_curve

ggsave("plot_images/log_models.png", dpi = 720, width = 12, height = 6)


log_mod_1990 <- train_log_mod %>% 
  filter(decade == 1990 |
           decade == 2000 |
           decade == 2010)

model_90_00_10 <- glm(is_popular ~ loudness + explicit + danceability + no_of_artists,
                      family = "binomial",
                      data = log_mod_1990)

summary(model_90_00_10)

Call:
glm(formula = is_popular ~ loudness + explicit + danceability + 
    no_of_artists, family = "binomial", data = log_mod_1990)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.2496  -1.1659   0.8386   1.1431   2.3491  

Coefficients:
                Estimate Std. Error z value             Pr(>|z|)    
(Intercept)   -3.1595029  0.1137598 -27.773 < 0.0000000000000002 ***
loudness       0.0342887  0.0014178  24.185 < 0.0000000000000002 ***
explicit       0.4662174  0.0258787  18.016 < 0.0000000000000002 ***
danceability   0.0031530  0.0005672   5.559         0.0000000271 ***
no_of_artists  0.1691662  0.0143345  11.801 < 0.0000000000000002 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 66344  on 47879  degrees of freedom
Residual deviance: 64758  on 47875  degrees of freedom
AIC: 64768

Number of Fisher Scoring iterations: 4
tidy(model_90_00_10)

train_model_90_00_10 <- log_mod_1990 %>%
  add_predictions(model_90_00_10, type = "response")

roc_obj_mod_90_00_10 <- train_model_90_00_10 %>%
  roc(response = is_popular, predictor = pred)
Setting levels: control = FALSE, case = TRUE
Setting direction: controls < cases
roc_curve <- ggroc(
  data = list(
    best_model = roc_obj_mod4
  ), 
  legacy.axes = TRUE,
  linewidth = 2) +
  coord_fixed() +
  labs(x = "1 - Specificity",
       y = "Sensitivity") +
  scale_colour_brewer(palette =  "Dark2") +
  scale_colour_discrete(labels = "Final Model") +
  theme_classic() +
  theme(axis.text  = element_text(face = "bold", size = 18, family = "Times"),
        axis.title  = element_text(face = "bold", size = 18, family = "Times"),
        title = element_text(face = "bold", size = 18, family = "Times"),
        legend.text = element_text(face = "bold", size = 16, family = "Times"),
        legend.title = element_blank()
  )
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
auc(roc_obj_mod_90_00_10)
Area under the curve: 0.629
roc_curve

ggsave("plot_images/log_best_model.png", dpi = 720, width = 12, height = 6)

---
title: "Spotify Analysis"
output: html_notebook
---

```{r include=FALSE}
library(spotifyr)
library(tidyverse)
library(modelr)
library(infer)
library(modelr)
library(ggfortify)
library(GGally)
```

```{r include=FALSE}
spotify_data_clean_join <- read_csv("clean_data/spotify_clean_join.csv")
spotify_data <- read_csv("raw_data/spotify_data.csv")
```


## Hypothesis Tests on Audio Features

Two sample - independent tests

This is an example of the hypothesis tests which were carried out on each audio feature.

H0: The mean danceability in 1960s is the same as the mean danceability in 2010s

Ha: The mean danceability in 1960s is less than the mean danceability in 2010s

H0: The difference in means in 0
Ha: danceability2020 - danceability1960 > 0
```{r include=FALSE}
dance_decade_hyp <- spotify_data_clean_join %>% 
  select(decade, danceability) %>% 
  filter(decade == 1960 | decade == 2010) %>% 
  mutate(decade = as.factor(decade))

null_distribution <- dance_decade_hyp %>% 
  specify(danceability ~ decade) %>% 
  hypothesize(null = "independence") %>% 
  generate(reps = 5000, type = "permute") %>% 
  calculate(stat = "diff in means", order = c("2010", "1960")) 

observed_stat <- dance_decade_hyp %>% 
  specify(danceability ~ decade) %>%
  calculate(stat = "diff in means", order = c("2010", "1960"))

null_distribution %>%
  visualise() +
  shade_p_value(obs_stat = observed_stat, direction = "right")

p_value <- null_distribution %>%
  get_p_value(obs_stat = observed_stat, direction = "right")
```

### How do the audio features change over time

```{r}
spotify_data_clean_pivot <- spotify_data_clean_join %>% 
  pivot_longer(cols = acousticness:valence,
              names_to = "audio_feature",
              values_to = "value") %>% 
  group_by(year, audio_feature) %>% 
  mutate(avg_feature = mean(value))
```


Plot with all Audio Features
```{r}
audiot_features_all <- spotify_data_clean_pivot %>%
  ggplot() +
  aes(x = year, y = avg_feature, group = audio_feature, 
      colour = audio_feature) +
  geom_line(linewidth = 2) +
  ylim(0, 100) +
  labs(x = "Year",
       y = "Value",
       title = "Audio Features Over Time",
       subtitle = "Avg per year",
       colour = "Audio Feature") +
  theme_bw() +
  scale_colour_brewer(palette =  "Dark2") +
  theme(axis.text  = element_text(face = "bold", size = 18, family = "Times"),
        axis.title  = element_text(face = "bold", size = 18, family = "Times"),
        title = element_text(face = "bold", size = 15, family = "Times"),
        legend.text = element_text(face = "bold", size = 14, family = "Times")
  ) 

#ggsave(audiot_features_all, "plot_images/audio_features_all.png")
```

Plot with audio features showing change
```{r}
audio_features_cut <- spotify_data_clean_pivot %>%
    filter(audio_feature == "acousticness" |
           audio_feature == "instrumentalness" |
           audio_feature == "danceability" |
           audio_feature == "speechiness" |
           audio_feature == "loudness") %>% 
  ggplot() +
  aes(x = year, y = avg_feature, group = audio_feature, 
      colour = audio_feature) +
  geom_line(linewidth = 2) +
  ylim(0, 100) +
  labs(x = "Year",
       y = "Value",
       # title = "Audio Features by Release Year",
       # subtitle = "Avg per year",
       colour = "Audio Feature") +
  theme_bw() +
  scale_colour_brewer(palette =  "Dark2") +
  scale_colour_discrete(labels = c("Acousticness", "Danceability", "Instrumentalness",
                                   "Loudness", "Speechiness")) +

  theme(axis.text  = element_text(face = "bold", size = 20, family = "Times"),
        axis.title  = element_text(face = "bold", size = 22, family = "Times"),
        title = element_text(face = "bold", size = 18, family = "Times"),
        legend.text = element_text(face = "bold", size = 20, family = "Times"),
        legend.title = element_blank()
  ) 

ggsave("plot_images/audio_features_cut.png", dpi = 720, width = 12, height = 6)

?ggsave
```

```{r}
spotify_data_clean_join %>% 
  #mutate(explicit = as.numeric(explicit)) %>% 
  group_by(year) %>% 
  summarise(total_ex = sum(explicit)) %>% 
  ggplot() +
  aes(x = year, y = total_ex / 20) +
  geom_line(colour = "red", linewidth = 2) +
  ylim(0, 50) +
  labs(x = "Year",
       y = "Proportion")+
       # title = "Proportion of Explicit Songs",
       # subtitle = "by year") +
  theme_classic() +
  theme(axis.text  = element_text(face = "bold", size = 18, family = "Times"),
        axis.title  = element_text(face = "bold", size = 22, family = "Times"),
        title = element_text(face = "bold", size = 18, family = "Times"),
        legend.text = element_text(face = "bold", size = 14, family = "Times")
  ) 

ggsave("plot_images/explicit.png", dpi = 720, width = 10, height = 6)
```
```{r}
spotify_data_clean_join %>% 
  group_by(year) %>% 
  summarise(mean_pop = mean(popularity)) %>% 
  ggplot() +
  aes(x = year, y = mean_pop) +
  geom_line(colour = "purple2", linewidth = 2) +
  labs(x = "Year",
       y = "Popularity") +
  ylim(0, 100) +
  theme_bw() +
  scale_colour_brewer(palette =  "Dark2") +
  theme(axis.text  = element_text(face = "bold", size = 18, family = "Times"),
        axis.title  = element_text(face = "bold", size = 22, family = "Times"),
        title = element_text(face = "bold", size = 18, family = "Times"),
        legend.text = element_text(face = "bold", size = 14, family = "Times")
  ) 

ggsave("plot_images/avg_pop.png", dpi = 720, width = 10, height = 6)

```

```{r}
spot_sample %>% 
  ggplot() +
  aes(x = danceability, y = popularity) +
  geom_point(colour = "#1DB954", alpha = 0.8) +
  geom_smooth(method = "lm", se = FALSE, colour = "purple3", linewidth = 2) +
  labs(x = "Danceability",
       y = "Popularity") +
       theme_classic() +
  theme(axis.text  = element_text(face = "bold", size = 18, family = "Times"),
        axis.title  = element_text(face = "bold", size = 20, family = "Times"),
        title = element_text(face = "bold", size = 18, family = "Times"),
        legend.text = element_text(face = "bold", size = 14, family = "Times")
  )

#ggsave("plot_images/pop_vs_dance.png", dpi = 720, width = 10, height = 6)

```

```{r}
spot_sample %>% 
  ggplot() +
  aes(x = loudness, y = popularity) +
  geom_point(colour = "#1DB954", alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, colour = "purple3", linewidth = 2) +
  labs(x = "Loudness",
       y = "Popularity") +
       theme_classic() +
  theme(axis.text  = element_text(face = "bold", size = 18, family = "Times"),
        axis.title  = element_text(face = "bold", size = 22, family = "Times"),
        title = element_text(face = "bold", size = 18, family = "Times"),
        legend.text = element_text(face = "bold", size = 14, family = "Times")
  )

ggsave("plot_images/pop_vs_loud.png", dpi = 720, width = 10, height = 6)

```

### Intro To Linear Regression

To help me answer my question of what makes a song "popular" on spotify, I decided to build an explanatory linear regression model. 

This type of analysis is used to determine the strength of the relationship between a response variable and multiple explanatory variables.

So in this case I will be choosing from all of the variables I have just discussed to explain the popularity of a song on spotify.

y = b0 + b1x1 + b2x2 + b3x3....bnxn

popularity = b0 + b1x1 + b2x2 + b3x3....bnxn

To start this process I plot popularity against each one of my variables or possible explanatory variables and find the strongest correlation.

```{r}
n_data <- nrow(spotify_data_for_modelling)

sample_index <- sample(1:n_data, size = n_data*0.1)

spot_sample <- slice(spotify_data_for_modelling, sample_index)

spot_sample %>% 
  ggplot() +
  aes(x = year, y = popularity) +
  geom_point(alpha = 0.7, colour = "#1DB954") +
  geom_smooth(method = "lm", se = FALSE, colour = "purple2") +
  theme_classic() +
  labs(x = "Year",
       y = "Popularity") +
  theme(axis.text  = element_text(face = "bold", size = 18, family = "Times"),
        axis.title  = element_text(face = "bold", size = 18, family = "Times"),
        title = element_text(face = "bold", size = 18, family = "Times"),
        legend.text = element_text(face = "bold", size = 14, family = "Times")
  )

#ggsave("plot_images/pop_vs_dance.png", dpi = 720, width = 10, height = 6)
```

```{r}
spot_sample %>% 
  ggplot() +
  aes(x = explicit, y = popularity) +
  geom_point(alpha = 0.7, colour = "red4") +
  geom_smooth(method = "lm", se = FALSE)
```


My strongest correlation was year (the year the track was released) with a correlation of 0.74. I then add this to my model as my first explanatory variable.

```{r}
model_1a <- lm(popularity ~ year,
               data = train_lm)

summary(model_1a)
```

After running my model I'm looking at 3 factors:
 - The P-value - is this variable making a significant difference. If the P-value is below the significance level of 0.05 then we can reject the null hypothesis and conclude that correlation between the variables is significant
 
 - The R^2 - is a measure that indicates how much of the variation of popularity is explained by the year
 - The adjusted R^2 - compensates for the addition of variables. So as we're building an explanatory model, we don't want this to drop much lower than the r^2.

```{r}
model_3a <- lm(popularity ~ year + danceability + loudness,
               data = train_lm)

summary(model_3a)
```

```{r}
model_3b <- lm(popularity ~ year + danceability + instrumentalness,
               data = train_lm)

summary(model_3b)
```

```{r}
anova(model_3a, model_2c)
```

Final Model

```{r}
model_6a <- lm(popularity ~ year + danceability + loudness + liveness + explicit,
               data = train_lm)

summary(model_6a)
``` 

So here we have our final model. I stopped adding variables as the Adjusted r^2 started dropping and our multiple r^2 was barely going up.

So we can see that the more recent the release, the more danceability, the louder it is, the less liveness it has. So you don't want a live recording, you want a studio recording with some explicit lyrics chucked in there as well.

All of our P-Values are significant and we have a Multiple r^2 of 0.55, with an adjusted r^2 also of 0.55. This means that 55% of the variance in popularity is explained our other variables. Which  also means that 45% of the variability in the data cannot be explained by this model.

55% isn't a very high proportion but it's not terrible. We are trying to measure the popularity of a song. It's not quite as obvious as if we were trying to explain the value of a house. We might find that the postcode and the number of rooms goes a long way to explaining that value. I think explaining the popularity of a song is bit more complicated than that. If it was easy to know what made a song popular then we'd all be musicians.

Mention human behaviour, mention the basic practice of statistics would call this a moderate effect size. 


```{r}
model_6b <- lm(popularity ~ year + danceability + loudness + liveness + explicit,
               data = test_lm)

summary(model_6b)

``` 

So my model isn't great. What can I do to improve this? Well...

When I was researching how spotify calculate their popularity score I kept coming across this one phrase.

### 50 is the magic number

The higher your popularity index, the more likely the algorithm is to recommend you to new listeners, and place you in algorithmic playlists like Release Radar and Discover Weekly. Many websites and blogs had theories on what number you had to hit to be added to certain editorial playlists. 

DIYMusician suggests that and popularity of 20+ in the first few weeks of release will get you onto the Release Radar playlist and a popularity score of 30+ will get you onto Discover Weekly. Loudlab suggested that 50 is the magic number. I liked this idea of having a threshold of whether or not a song is popular. So going back to my variables I created a column called is_popular which is only TRUE if the popularity of the song is 50 or over. This also lends itself nicely to a logistic regression model. 

Logistic regression is a statistical analysis method to predict, or explain a binary outcome, such as yes or no, based on prior observations of a data set.

So rather than using the popularity score I have used the variable I created called is_popular, which splits the data in to a logical type, so it's TRUE if the song is 50 or above.

The way this is built is very similar to the linear model. I look for correlations, and I add them to my model 1 at a time, check they are significant. The main difference is that I'm looking for for a high AUC score this time, rather than the multiple r^squared I was looking for in the linear model.

I'll explain the AUC in a second. I decided for this model to make a couple of changes. I decided to no longer include the year or decade the song was released. If I'm staying true to my original question then how could I possibly write a song that was released in the past. It had such a large influence on the linear model I thought it would be more interesting to see how the logistic model fared without it. And then I can test my model against different decades to see how it performs.


Oscar Wilde is famously credited with having written: “Popularity is the one insult I have never suffered.”

```{r}
spot_sample %>% 
  mutate(explicit = as.logical(explicit)) %>% 
  ggplot() +
  aes(x = loudness, y = popularity, colour = explicit) +
  geom_point(alpha = 0.8) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(x = "Loudness",
       y = "Popularity",
       title = "Relationship between Popularity and Loundess",
       subtitle = "Grouped by explicit or not") +
    scale_colour_brewer(palette =  "Dark2") +
  theme_bw() +
  theme(axis.text  = element_text(face = "bold", size = 15, family = "Times"),
        axis.title  = element_text(face = "bold", size = 15, family = "Times"),
        title = element_text(face = "bold", size = 15, family = "Times"),
        legend.text = element_text(face = "bold", size = 10, family = "Times")
  ) 
```


```{r}
spot_sample %>% 
  mutate(explicit = as.logical(explicit)) %>%
ggplot() +
  aes(x = loudness, y = as.integer(is_popular), colour = explicit) +
  geom_jitter(shape = 1,
              position = position_jitter(h = 0.05, w = 0.05),
              alpha = 0.8) +
   geom_line(data = train_model_1_loud, aes(x = loudness , y = pred), col = 'red') +
  ylab("Probability") +
      scale_colour_brewer(palette =  "Dark2") +
  theme_bw() +
  theme(axis.text  = element_text(face = "bold", size = 15, family = "Times"),
        axis.title  = element_text(face = "bold", size = 15, family = "Times"),
        title = element_text(face = "bold", size = 15, family = "Times"),
        legend.text = element_text(face = "bold", size = 10, family = "Times")
  ) 


# ggplot(mortgage_data) +
#   geom_jitter(aes(x = tu_score, y = as.integer(accepted)), shape = 1, 
#               position = position_jitter(h = 0.03)) + 
#    geom_line(data = predict_log, aes(x = tu_score , y = pred), col = 'red') + 
#   ylab("Probability")
```


Final Logistic Regression Model
```{r}
model_4_final <- glm(is_popular ~ loudness + explicit + danceability + no_of_artists,
             family = "binomial",
             data = train_log_mod)

summary(model_4_final)

tidy(model_4_final)

train_model_4_final <- train_log_mod %>%
  add_predictions(model_4_final, type = "response")

roc_obj_mod4 <- train_model_4_final %>%
  roc(response = is_popular, predictor = pred)

roc_curve <- ggroc(
  data = list(
    best_model = roc_obj_mod4
  ), 
  legacy.axes = TRUE) +
  coord_fixed() + 
  theme_classic()

auc(roc_obj_mod4)
```


```{r}
log_mod_1990 <- train_log_mod %>% 
  filter(decade == 1990 |
           decade == 2000 |
           decade == 2010)

model_90_00_10 <- glm(is_popular ~ loudness + explicit + danceability + no_of_artists,
                      family = "binomial",
                      data = log_mod_1990)

summary(model_90_00_10)

tidy(model_90_00_10)

train_model_90_00_10 <- log_mod_1990 %>%
  add_predictions(model_90_00_10, type = "response")

roc_obj_mod_90_00_10 <- train_model_90_00_10 %>%
  roc(response = is_popular, predictor = pred)


roc_curve <- ggroc(
  data = list(
    best_model = roc_obj_mod4,
    dec_60s_70s_80s = roc_obj_mod_60_70_80,
    dec_90s_00s_10s = roc_obj_mod_90_00_10
  ), 
  legacy.axes = TRUE,
  linewidth = 2) +
  coord_fixed() +
  labs(x = "1 - Specificity",
       y = "Sensitivity") +
  scale_colour_brewer(palette =  "Dark2") +
  scale_colour_discrete(labels = c("Final Model", "60s, 70s & 80s", "90s, 00s & 10s")) +
  theme_classic() +
  theme(axis.text  = element_text(face = "bold", size = 18, family = "Times"),
        axis.title  = element_text(face = "bold", size = 18, family = "Times"),
        title = element_text(face = "bold", size = 18, family = "Times"),
        legend.text = element_text(face = "bold", size = 18, family = "Times"),
        #legend.text = element_blank(),
        legend.title = element_blank()
        #legend.key = element_blank()
  )

auc(roc_obj_mod_90_00_10)

roc_curve

ggsave("plot_images/log_models.png", dpi = 720, width = 12, height = 6)

```


```{r}

log_mod_1990 <- train_log_mod %>% 
  filter(decade == 1990 |
           decade == 2000 |
           decade == 2010)

model_90_00_10 <- glm(is_popular ~ loudness + explicit + danceability + no_of_artists,
                      family = "binomial",
                      data = log_mod_1990)

summary(model_90_00_10)

tidy(model_90_00_10)

train_model_90_00_10 <- log_mod_1990 %>%
  add_predictions(model_90_00_10, type = "response")

roc_obj_mod_90_00_10 <- train_model_90_00_10 %>%
  roc(response = is_popular, predictor = pred)


roc_curve <- ggroc(
  data = list(
    best_model = roc_obj_mod4
  ), 
  legacy.axes = TRUE,
  linewidth = 2) +
  coord_fixed() +
  labs(x = "1 - Specificity",
       y = "Sensitivity") +
  scale_colour_brewer(palette =  "Dark2") +
  scale_colour_discrete(labels = "Final Model") +
  theme_classic() +
  theme(axis.text  = element_text(face = "bold", size = 18, family = "Times"),
        axis.title  = element_text(face = "bold", size = 18, family = "Times"),
        title = element_text(face = "bold", size = 18, family = "Times"),
        legend.text = element_text(face = "bold", size = 16, family = "Times"),
        legend.title = element_blank()
  )

auc(roc_obj_mod_90_00_10)

roc_curve

ggsave("plot_images/log_best_model.png", dpi = 720, width = 12, height = 6)

```




